# Import libraries
library(ggplot2)
library(knitr)
library(kableExtra)
# library(foreach)
# library(doParallel)
# library(tictoc)
library(htmltools)
library(limma) # Bioconductor
library(DESeq2) # Bioconductor
library(dplyr) # R package
library(latex2exp)
library(RColorBrewer)
library(gplots)
library(calibrate)
library(ggrepel)
baseWD <- "./" #paste0(getwd(),"/")
First we read in counts data inside the counts.matrix file, the experimental design (exp_design.csv), and the gene annotation table (gene_annotation.txt).
exp_design <- read.table(paste0(baseWD,"exp_design.csv"), header=T, sep=",")
countData <- read.table(paste0(baseWD,"counts.matrix"), header=T, sep="\t", row.names=1)
# get rid of rows with zero values
countData <- countData[apply(countData, 1, function(countData) !all(countData==0)),]
# Load annotation table
dbs <- read.table(paste0(baseWD,"gene_annotation.txt"), header=FALSE, sep="\t")
dbs <- dbs[,c(9, 11, 13)]
dbs <- sapply(dbs, function(x){gsub("gene_id|gene_name|gene_biotype| ", "", x)})
colnames(dbs) <- c("Geneid", "Gene", "Class")
dbs <- data.frame(dbs)
| SRR1958165 | SRR1958166 | SRR1958167 | SRR1958168 | SRR1958169 | SRR1958170 | |
|---|---|---|---|---|---|---|
| ENSG00000227232 | 2 | 1 | 3 | 7 | 10 | 5 |
| ENSG00000238009 | 1 | 0 | 0 | 0 | 0 | 1 |
| ENSG00000233750 | 2 | 0 | 2 | 3 | 1 | 1 |
| ENSG00000268903 | 36 | 43 | 30 | 33 | 29 | 37 |
| ENSG00000269981 | 39 | 28 | 38 | 52 | 43 | 19 |
| samples | labels | condition |
|---|---|---|
| SRR1958165 | control1 | control |
| SRR1958166 | control2 | control |
| SRR1958167 | FOXA2KD1 | FOXA2 |
| SRR1958168 | FOXA2KD2 | FOXA2 |
| SRR1958169 | DEANR1KD1 | DEANR1 |
| SRR1958170 | DEANR1KD2 | DEANR1 |
| Geneid | Gene | Class |
|---|---|---|
| ENSG00000223972 | DDX11L1 | transcribed_unprocessed_pseudogene |
| ENSG00000227232 | WASH7P | unprocessed_pseudogene |
| ENSG00000278267 | MIR6859-1 | miRNA |
| ENSG00000243485 | MIR1302-2HG | lncRNA |
| ENSG00000284332 | MIR1302-2 | miRNA |
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | |
|---|---|---|---|---|---|---|
| SRR1958165 | 0 | 1 | 8 | 548.4 | 317.0 | 130506 |
| SRR1958166 | 0 | 1 | 8 | 573.6 | 322.0 | 199856 |
| SRR1958167 | 0 | 1 | 7 | 595.7 | 298.0 | 146246 |
| SRR1958168 | 0 | 1 | 7 | 556.1 | 293.0 | 182955 |
| SRR1958169 | 0 | 1 | 7 | 583.6 | 300.0 | 150419 |
| SRR1958170 | 0 | 0 | 7 | 574.0 | 288.5 | 261800 |
barplot(colSums(countData)/1e6, col="#66BD63",las=1,main="Total read counts (millions)", ylab="Total reads in millions")
transformation <- paste0("Log_2(",colnames(countData)[1],")")
hist(log2(countData[,1]),br=200, xlab = TeX(transformation), col="#08519C",
main = TeX(paste0("Histogram of ",transformation)))
logCountData = log2(1 + countData)
par(mfrow = c(1, 2), mar = c(8,2,2,2)) # two columns
hist(logCountData[,1], xlab = TeX(transformation),
main = TeX(paste0("Histogram of ",transformation)))
boxplot(logCountData,las=3)
x <- logCountData
myColors <- brewer.pal(dim(x)[2],"Spectral")#rainbow(dim(x)[2])
plot(density(x[,1]),col = myColors[1], lwd=2,
xlab="Expression values", ylab="Density", main= "Distribution of transformed data",
ylim=c(0, max(density(x[,1])$y)+.02 ) )
for(i in 2:dim(x)[2] )
lines(density(x[,i]),col=myColors[i], lwd=2 )
legend("topright", cex=1.1,colnames(x), lty=rep(1,dim(x)[2]), col=myColors)
xlab <- paste0("Log_2(",colnames(countData)[1],")")
ylab <- paste0("Log_2(",colnames(countData)[2],")")
plot(logCountData[,1],logCountData[,2],
xlab = TeX(xlab),
ylab = TeX(ylab),
main = TeX(paste0(ylab, " vs ",xlab)))
Using the condition column from the experimental design
condition <- exp_design$condition
countdata <- as.matrix(countData)
coldata <- data.frame(row.names=colnames(countdata), condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
# Run the DESeq pipeline
dds <- DESeq(dds)
# Plot dispersions
plotDispEsts(dds, main="")
dds <- dds[ rowSums(counts(dds)) > 5, ]
Regularized log transformation
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlog(dds, blind = FALSE)
kable(head(assay(rld), 5), booktabs = TRUE) %>%
kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
| SRR1958165 | SRR1958166 | SRR1958167 | SRR1958168 | SRR1958169 | SRR1958170 | |
|---|---|---|---|---|---|---|
| ENSG00000227232 | 2.1387750 | 2.1182630 | 2.1570646 | 2.2370556 | 2.2845222 | 2.2005411 |
| ENSG00000238009 | -1.4859428 | -1.5003799 | -1.5003351 | -1.4998930 | -1.5001465 | -1.4855680 |
| ENSG00000233750 | 0.5784663 | 0.5392201 | 0.5778457 | 0.5997311 | 0.5593197 | 0.5602591 |
| ENSG00000268903 | 5.1091409 | 5.1839010 | 5.0213992 | 5.0898451 | 5.0185675 | 5.1467081 |
| ENSG00000269981 | 5.1816560 | 5.0226782 | 5.1607847 | 5.3581017 | 5.2356108 | 4.9106334 |
Variance-stabilizing transformation
vsd <- vst(dds, blind = FALSE)
kable(head(assay(vsd), 5), booktabs = TRUE) %>%
kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
| SRR1958165 | SRR1958166 | SRR1958167 | SRR1958168 | SRR1958169 | SRR1958170 | |
|---|---|---|---|---|---|---|
| ENSG00000227232 | 7.978207 | 7.938153 | 8.006709 | 8.099521 | 8.143688 | 8.061025 |
| ENSG00000238009 | 7.939243 | 7.845119 | 7.845119 | 7.845119 | 7.845119 | 7.941747 |
| ENSG00000233750 | 7.978207 | 7.845119 | 7.977079 | 8.011787 | 7.939686 | 7.941747 |
| ENSG00000268903 | 8.406419 | 8.450830 | 8.353740 | 8.394871 | 8.351862 | 8.429002 |
| ENSG00000269981 | 8.429036 | 8.335134 | 8.416776 | 8.532870 | 8.460670 | 8.264907 |
dds <- estimateSizeFactors(dds)
kable(t(sizeFactors(dds)), booktabs = TRUE) %>%
kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
| SRR1958165 | SRR1958166 | SRR1958167 | SRR1958168 | SRR1958169 | SRR1958170 |
|---|---|---|---|---|---|
| 1.021349 | 1.045443 | 1.038893 | 0.9764818 | 1.011798 | 0.9690877 |
Usings the normalized=TRUE option in the counts() method of DESeq2, we adjust for different library sizes.
slog <- log2(counts(dds, normalized=TRUE)+1)
kable(head(slog, 5), booktabs = TRUE) %>%
kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
| SRR1958165 | SRR1958166 | SRR1958167 | SRR1958168 | SRR1958169 | SRR1958170 | |
|---|---|---|---|---|---|---|
| ENSG00000227232 | 1.5647169 | 0.968299 | 1.958913 | 3.030088 | 3.4440569 | 2.622811 |
| ENSG00000238009 | 0.9848425 | 0.000000 | 0.000000 | 0.000000 | 0.0000000 | 1.022828 |
| ENSG00000233750 | 1.5647169 | 0.000000 | 1.548499 | 2.025828 | 0.9915642 | 1.022828 |
| ENSG00000268903 | 5.1798097 | 5.396807 | 4.900958 | 5.120800 | 4.8905366 | 5.292054 |
| ENSG00000269981 | 5.2922220 | 4.796126 | 5.231794 | 5.761615 | 5.4428972 | 4.364997 |
par(mfrow = c(1, 3)) # 3 columns
plot(slog[,1],slog[,2])
plot(assay(rld)[,1],assay(rld)[,2])
plot(assay(vsd)[,1],assay(vsd)[,2])
df <- bind_rows(
as_tibble(slog[,1:2]) %>%
mutate(transformation = "log2(x + 1)"),
as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
mycols <- brewer.pal(length(unique(condition)), "YlGnBu")[1:length(unique(condition))]
mycols[1] <- "#D7191C"
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
pcaData <- plotPCA(rld, intgroup = c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
geom_text_repel(aes(PC1, PC2, label = rownames(pcaData)), pcaData) +
scale_color_manual(name="Condition", values = mycols) +
theme(aspect.ratio=1)
k <- 5 # Number of clusters
n <- 100 # number of top genes by standard deviation
x <- assay(rld)
if(n>dim(x)[1]) n = dim(x)[1] # max as data
x <- x[order(apply(x,1,sd),decreasing=TRUE),] # sort genes by standard deviation
x <- x[1:n,] # only keep the n genes
x <- 100* x[1:n,] / apply(x[1:n,],1,function(y) sum(abs(y))) # L1 norm
set.seed(2)
#k = input$k # number of clusters
cl <- kmeans(x,k,iter.max = 50)
hc <- hclust(dist(cl$centers-apply(cl$centers,1,mean) ) )
tem <- match(cl$cluster,hc$order) # new order
x <- x[order(tem),]
bar <- sort(tem)
# heatmap with color bar define gene groups
myheatmap2 <- function (x,bar,n=-1) {
# number of genes to show
ngenes = as.character(table(bar))
if(length(bar) >n && n != -1) {ix = sort( sample(1:length(bar),n) ); bar = bar[ix]; x = x[ix,]}
# this will cutoff very large values, which could skew the color
gene_names <- rownames(x)
x <- as.matrix(x)-apply(x,1,mean)
cutoff <- median(unlist(x)) + 3*sd (unlist(x))
x[x>cutoff] <- cutoff
cutoff <- median(unlist(x)) - 3*sd (unlist(x))
x[x< cutoff] <- cutoff
#colnames(x)= detectGroups(colnames(x))
#row.names(x) <- gene_names
heatmap.2(x, Rowv =F,Colv=F, dendrogram ="none",
col=colorpanel(75, "black", "white"),
#col=greenred(75),
labRow = gene_names,
cexRow=0.6,
density.info="none",
trace="none", scale="none", keysize=.3,
key=F,RowSideColors = mycolors[bar],
margins = c(8, 24), srtCol=45
)
legend.text = paste("Cluster ", toupper(letters)[unique(bar)], " (N=", ngenes,")", sep="")
par(lend = 1) # square line ends for the color legend
legend("topright", # location of the legend on the heatmap plot
legend = legend.text, # category labels
col = mycolors, # color key
lty= 1, # line style
lwd = 10 # line width
)
}
mycolors <- brewer.pal(k,"YlGnBu")
mycolors[1] <- "#D7191C"
myheatmap2(x-apply(x,1,mean), bar,1000)
Developed by Roberto Villegas-Diaz
Villegas.Roberto@hotmail.com